***********************************************
* Title: rwanda_jde_table5-6.do
* Author: Todd Pugatch
* Last update: June 10 2024
* Description: analysis for Blimpo and Pugatch, "Entrepreneurship Education
*	and Teacher Training in Rwanda," Stage 2 Registered report, Journal of 
*	Development Economics
* Inputs: 	student_merge_jde.dta
* Outputs: 	rwanda_jde_table5-6.txt
*			rwanda_jde_table5-6_t[4-5].out
* Notes: produces Table 5 and Table 6
************************************************
local start=`"$S_TIME"'
clear
clear matrix
clear mata
graph drop _all
program drop _all
cap log close
set more off

* Set directories 
*global main "[SET MAIN DIRECTORY HERE]"
	global rawdata "$main/01_data/01_raw"
	global cleandata "$main/01_data/02_clean"
	global dofiles "$main/02_dofiles"
	global results "$main/03_results"
	global temp "$main/04_output"


log using "$temp/rwanda_jde_table5-6.txt", text replace

****************************
* SET UP MULTIPLE HYPOTHESIS CORRECTION
/*Proceed in 3 steps:
	1. Run all regressions for H1. Save coefficients & std. errors.
	2. Calculate q-values for all hypotheses.
	3. Re-run regressions to get output, including q-values.*/
****************************

* 1. GET COEF/SE/P FOR ALL REGRESSIONS	
* ACADEMIC OUTCOMES
/*Goal: produce columns 1-3 of Table P4
	Uses endline student survey.
	NOTE: still awaiting administrative data on exam performance to produce columns 1-2.
	Regress academic outcome on treatment status, controlling
		for randomization strata. 
	Cluster s.e.'s by school.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).*/
qui use "$cleandata/student_merge_jde.dta", clear

/*get non-normalized control group mean test scores, per request of Della Vigna et al
	team at Berkeley gathering predictions on project outcomes*/
su entrep_grade_el if treatment==0
	
* normalize exam scores to control mean
/*careful because distribution of S6 entrepreneurship exam score is discrete (0-6)*/
foreach y in lastpromexam_el entrep_grade_el S3_exam_bl {
	qui su `y' if treatment==0
	qui gen z`y'=(`y'-r(mean))/r(sd)
}
lab var zentrep_grade_el "S6 entrepreneurship exam score (z)"
lab var zlastpromexam_el "S4/S5 promotional exam score (z)"
lab var zS3_exam_bl "S3 exam score (z)"

* prep missing values for students without baseline outcome
qui gen zS3_exam_bli=zS3_exam_bl
qui su zS3_exam_bl if treatment==0
qui replace zS3_exam_bli=r(mean) if zS3_exam_bl==.	
qui gen S3_exam_blm=(S3_exam_bl==.)
lab var zS3_exam_bli "zS3_exam_bl, imputing control mean for missing values"
lab var S3_exam_blm "missing value for S3_exam_bl"

/*Columns 1-3: exam scores*/
/*Total score (col 1) unavailable*/
local Y "zentrep_grade_el zlastpromexam_el"
local Y0 "zS3_exam_bli S3_exam_blm"
local i=2
foreach y in `Y' {
	qui areg `y' treatment `Y0', a(strata) cluster(school_code)
	scalar define H3_t4_c`i'b=_b[treatment]
	scalar define H3_t4_c`i'se=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define H3_t4_c`i'p=2*ttail(e(df_r),abs(t))
	local i=`i'+1
}

* ENTREPRENEURSHIP SKILLS
/*Goal: produce columns 4-9 of Table P4
	Uses endline student survey.
	Regress outcome on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).*/

* prep missing values for students without baseline outcome
local Y0 "wait_10k_bl wait_20k_bl compound_interest_bl anysavings_bl savings_less5k_bl savings_5kto10k_bl savings_more10k_bl eknowledge_index_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

* prep continuous financial variables: winsorize at 99th percentile
winsor savings_usd_cond_el, gen(savings_usd_condw_el) p(0.01) highonly
lab var savings_usd_condw_el "savings_usd_cond_el, winsorized at 99th percentile"

* set up control variables, by table column
local Y04 "wait_10k_bli wait_10k_blm"
local Y05 "wait_20k_bli wait_20k_blm"
local Y06 "compound_interest_bli compound_interest_blm"
local Y07 "anysavings_bli anysavings_blm"
local Y08 "savings_less5k_bli savings_5kto10k_bli savings_more10k_bli savings_less5k_blm savings_5kto10k_blm savings_more10k_blm"
local Y09 "eknowledge_index_bli eknowledge_index_blm"

/*regressions: Columns 4-9*/
local Y "wait_10k wait_20k compound_interest anysavings	savings_usd_condw eknowledge_index"
local i=4 /*initialize loop*/
foreach y in `Y' {
	qui areg `y'_el treatment `Y0`i'', a(strata) cluster(school_code)
	scalar define H3_t4_c`i'b=_b[treatment]
	scalar define H3_t4_c`i'se=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define H3_t4_c`i'p=2*ttail(e(df_r),abs(t))
	local i=`i'+1
}

* NON-COGNITIVE SKILLS
/*Goal: produce columns 1-5 of Table P5
	Uses endline student survey.
	Regress non-cognitive skill on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).*/
* prep missing values for students without baseline outcome
local Y0 "planned_schl_postsec_bl planned_occup_busorpro_bl planned_business_bl	control_avg_bl grit_raw_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

local Y "planned_schl_postsec planned_occup_busorpro planned_business control_avg grit_raw"
local j=1 /*initialize loop*/
foreach y in `Y' {
	qui areg `y'_el treatment `y'_bli `y'_blm, a(strata) cluster(school_code)
	scalar define H3_t5_c`j'b=_b[treatment]
	scalar define H3_t5_c`j'se=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define H3_t5_c`j'p=2*ttail(e(df_r),abs(t))
	local j=`j'+1
}

* 2. CALCULATE Q-VALUES FOR ALL HYPOTHESES
/*Calculate sharpened q-values from Benjamini, Krieger, Yuketeli (2006).
	Use code from fdr_sharpened_qvalues.do from Anderson (2008), available at 
		https://are.berkeley.edu/~mlanderson/ARE_Website/Research.html*/
* prep data: put p-values from above into a dataset
clear
local I=`i'-1 /*#cols in Table 4*/
local J=`j'-1 /*#cols in Table 5*/
local n=`I'+`J'
qui set obs `n'
qui gen hypothesis=3
qui gen table=4
local t5start=`I'+1
qui replace table=5 in `t5start'/`n'
qui bysort table: egen column=seq()
foreach x in b se pval {
	qui gen double `x'=.
}
forval i=2/`I' { /*start at col 2 based on data availability*/
	qui replace b=H3_t4_c`i'b if table==4 & column==`i'
	qui replace se=H3_t4_c`i'se if table==4 & column==`i'
	qui replace pval=H3_t4_c`i'p if table==4 & column==`i' 
}
forval j=1/`J' {
	qui replace b=H3_t5_c`j'b if table==5 & column==`j'
	qui replace se=H3_t5_c`j'se if table==5 & column==`j'
	qui replace pval=H3_t5_c`j'p if table==5 & column==`j' 
}

sort table column

* START OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
* Collect the total number of p-values tested
quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Generate the variable that will contain the BKY (2006) sharpened q-values

qui gen bky06_qval = 1 if pval~=.
	
* Set the initial counter to 1 

local qval = 1

/* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, 
	then checks which hypotheses are rejected at q = 0.999, 
	then checks which hypotheses are rejected at q = 0.998, etc.  
	The loop ends by checking which hypotheses are rejected at q = 0.001. */
while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	
	* Generate value q'*r/M
	qui gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q'*r/M
	qui gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank1 = reject_temp1*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	
	* Generate value q_2st*r/M
	qui gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	qui gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank2 = reject_temp2*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	qui replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}

quietly sort original_sorting_order

******************************
* END OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
******************************
* list results
list hypothesis table column b se pval bky06_qval

* prep results for inclusion in final regression output
sort table column
forval i=2/`I' {
	scalar define H3_t4_c`i'q=bky06_qval in `i'
}
local j=1
forval k=`t5start'/`n' {
	scalar define H3_t5_c`j'q=bky06_qval in `k'
	local j=`j'+1
}

* 3. RE-RUN REGRESSIONS TO GET OUTPUT, INCLUDING Q-VALUES
* ACADEMIC OUTCOMES
/*Goal: produce columns 1-3 of Table P4
	Uses endline student survey.
	NOTE: still awaiting administrative data on exam performance to produce columns 1-2.
	Regress academic outcome on treatment status, controlling
		for randomization strata. 
	Cluster s.e.'s by school.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).
	Include sharpened q-values calculated above.*/

qui use "$cleandata/student_merge_jde.dta", clear

* normalize exam scores to control mean
/*careful because distribution of S6 entrepreneurship exam score is discrete (0-6)*/
foreach y in lastpromexam_el entrep_grade_el S3_exam_bl {
	qui su `y' if treatment==0
	qui gen z`y'=(`y'-r(mean))/r(sd)
}
lab var zentrep_grade_el "S6 entrepreneurship exam score (z)"
lab var zlastpromexam_el "S4/S5 promotional exam score (z)"
lab var zS3_exam_bl "S3 exam score (z)"

* prep missing values for students without baseline outcome
qui gen zS3_exam_bli=zS3_exam_bl
qui su zS3_exam_bl if treatment==0
qui replace zS3_exam_bli=r(mean) if zS3_exam_bl==.	
qui gen S3_exam_blm=(S3_exam_bl==.)
lab var zS3_exam_bli "zS3_exam_bl, imputing control mean for missing values"
lab var S3_exam_blm "missing value for S3_exam_bl"

* define program for obtaining Lee bounds
program getleebounds
	qui predict e if e(sample), residuals
	qui leebounds e treatment, select(insample_el)
	scalar define lb=_b[lower]
	scalar define ub=_b[upper]
	drop e
end

/*Columns 1-3: exam scores*/
/*Total score (col 1) unavailable*/
local stat ""control mean",mu_c,"baseline mean",mu_0,"q-value",qv,"lower bound",lb,"upper bound",ub"
local Y "zentrep_grade_el zlastpromexam_el"
local Y0 "zS3_exam_bli S3_exam_blm"

local i=2
foreach y in `Y' {
	qui areg `y' `Y0', a(strata) cluster(school_code)
	getleebounds

	qui areg `y' treatment `Y0', a(strata) cluster(school_code)
	qui su `y' if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	qui su zS3_exam_bl if e(sample)
	scalar define mu_0=r(mean)
	scalar define qv=H3_t4_c`i'q
	outreg2 using "$results/rwanda_jde_table5-6_t4.xls", se excel nolabel nocons addstat(`stat') adec(5) replace
	local i=`i'+1
}

* ENTREPRENEURSHIP SKILLS
/*Goal: produce columns 4-9 of Table P4
	Uses endline student survey.
	Regress outcome on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).
	Include sharpened q-values calculated above.*/

* prep missing values for students without baseline outcome
local Y0 "wait_10k_bl wait_20k_bl compound_interest_bl anysavings_bl savings_less5k_bl savings_5kto10k_bl savings_more10k_bl eknowledge_index_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

* prep continuous financial variables: winsorize at 99th percentile
winsor savings_usd_cond_el, gen(savings_usd_condw_el) p(0.01) highonly
lab var savings_usd_condw_el "savings_usd_cond_el, winsorized at 99th percentile"

* set up control variables, by table column
local Y04 "wait_10k_bli wait_10k_blm"
local Y05 "wait_20k_bli wait_20k_blm"
local Y06 "compound_interest_bli compound_interest_blm"
local Y07 "anysavings_bli anysavings_blm"
local Y08 "savings_less5k_bli savings_5kto10k_bli savings_more10k_bli savings_less5k_blm savings_5kto10k_blm savings_more10k_blm"
local Y09 "eknowledge_index_bli eknowledge_index_blm"

/*regressions: Columns 4-9*/
local Y "wait_10k wait_20k compound_interest anysavings	savings_usd_condw eknowledge_index"
local i=4 /*initialize loop*/
foreach y in `Y' {
	qui areg `y'_el treatment `Y0`i'', a(strata) cluster(school_code)
	getleebounds

	qui areg `y'_el treatment `Y0`i'', a(strata) cluster(school_code)
	qui su `y'_el if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	if ("`y'"!="savings_usd_condw") {/*baseline outcome for column 8 is a vector, so omit from report*/
		qui su `y'_bl if e(sample)
		scalar define mu_0=r(mean)
	}
	scalar define qv=H3_t4_c`i'q /*WHY IS THIS OUTPUTTING ALL 1's?*/
	outreg2 using "$results/rwanda_jde_table5-6_t4.xls", se excel nolabel nocons addstat(`stat') adec(5) append
	local i=`i'+1
}

* NON-COGNITIVE SKILLS
/*Goal: produce columns 1-5 of Table P5
	Uses endline student survey.
	Regress non-cognitive skill on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).
	Include sharpened q-values calculated above.*/
* prep missing values for students without baseline outcome
local Y0 "planned_schl_postsec_bl planned_occup_busorpro_bl planned_business_bl	control_avg_bl grit_raw_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

/*regressions: Columns 1-5*/
local stat ""control mean",mu_c,"baseline mean",mu_0,"q-value",qv,"lower bound",lb,"upper bound",ub"

local Y "planned_schl_postsec planned_occup_busorpro planned_business control_avg grit_raw"
local j=1 /*initialize loop*/
foreach y in `Y' {
	qui areg `y'_el `y'_bli `y'_blm, a(strata) cluster(school_code)
	getleebounds
	
	qui areg `y'_el treatment `y'_bli `y'_blm, a(strata) cluster(school_code)
	qui su `y'_el if treatment==0
	scalar define mu_c=r(mean)
	qui su `y'_bl
	scalar define mu_0=r(mean)
	scalar define qv=H3_t5_c`j'q /*WHY IS THIS OUTPUTTING ALL 1's?*/
	outreg2 using "$results/rwanda_jde_table5-6_t5.xls", se excel nolabel nocons addstat(`stat') adec(5) append
}

local end=`"$S_TIME"' 
di "`start'"
di "`end'"
log close
